Slip 3

Q.1.   Write a Python program to implement Multiple Linear Regression for a house price 
dataset. Divide the dataset into training and testing data.

# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Step 2: Build a small in-memory house price dataset, one tuple per house.
feature_columns = ['Area', 'Bedrooms', 'Bathrooms', 'Stories']
target_column = 'Price'
house_rows = [
    # (Area, Bedrooms, Bathrooms, Stories, Price)
    (1200, 2, 1, 1, 200000),
    (1500, 3, 2, 2, 250000),
    (1000, 2, 1, 1, 180000),
    (1800, 3, 2, 2, 300000),
    (2400, 4, 3, 2, 400000),
    (3000, 4, 3, 3, 500000),
    (3500, 5, 4, 3, 550000),
    (4000, 5, 4, 3, 600000),
]
df = pd.DataFrame(house_rows, columns=feature_columns + [target_column])

# Step 3: Separate the predictors (X) from the value being predicted (y).
X = df[feature_columns]
y = df[target_column]

# Step 4: Hold out 20% of the rows for testing; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Step 5: Fit the multiple linear regression on the training rows.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

# Step 6: Predict prices for the held-out rows.
y_pred = model.predict(X_test)

# Step 7: Report error and goodness of fit on the test rows.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R² Score:", r2)

# Step 8: Show actual and predicted prices side by side, keeping the
# original row index of each test sample.
result_df = y_test.to_frame('Actual').assign(Predicted=y_pred)
print(result_df)

# Step 9: Scatter predicted against actual prices. The dashed red diagonal
# marks where a perfect prediction would fall (predicted == actual).
price_bounds = [y.min(), y.max()]
plt.scatter(y_test, y_pred, color='blue')
plt.plot(price_bounds, price_bounds, 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted House Prices')
plt.grid(True)
plt.show()

Q.2.  Use the dataset crash.csv, an accident survivors' dataset for the USA hosted by 
data.gov. The dataset contains each passenger's age, the speed of the vehicle (mph) at 
the time of impact, and the fate of the passenger (1 for survived and 0 for not survived) 
after the crash. Use logistic regression to decide whether age and speed can predict 
the survivability of the passengers. 

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Load the crash dataset once. crash.csv must be in the working directory and
# provide the 'Age', 'Speed' and 'Survived' columns used below.
df = pd.read_csv('crash.csv')
print(df.head())

# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())

# Exploratory view: pairwise relationships coloured by outcome.
sns.pairplot(df, hue="Survived")
plt.show()

# Features and Target
X = df[['Age', 'Speed']]     # Independent variables
y = df['Survived']           # Dependent variable (1 = survived, 0 = not survived)

# Train-Test Split (80% train, 20% test; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluation Metrics. The confusion matrix is computed once and reused for
# both the printed report and the heatmap.
cm = confusion_matrix(y_test, y_pred)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Visualise the confusion matrix as a heatmap. Start a fresh figure first:
# sns.heatmap draws on the current axes, which could otherwise still be one
# of the pairplot's subplots.
plt.figure()
sns.heatmap(cm, annot=True, fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

